################################################################################
#* Master File — Housing Question Old and New: Mapping Crowding, Tenure, Rents 
#* and Segregation in the Neighborhoods of Major European Cities around 1900 
#* and Today  
#* Author: Sebastian Kohl & Ria Wilken
#*  Last updated: November, 4th 2025
################################################################################

#*******************************************************************************
#* Note
#*******************************************************************************
#*
#* 
#* 

#*******************************************************************************
#* Preparation
#*******************************************************************************


# Set working directory automatically
if (!require("here")) install.packages("here")
library(here)


# Clear environment
rm(list = ls())



source("/run my packages.R")

# Load data 
load("/final.RData")

names(dist)

# Descriptive statistics
length(unique(dist$iso))
length(unique(dist$city))
length(unique(dist$city_year))

dist1 = dist %>% group_by(city_year) %>% summarise(length(district), mittel= mean(population/length(district), na.rm = T))
sum(dist1$`length(district)`)
summary(dist1$`length(district)`)
hist(dist1$`length(district)`)
summary(dist1$mittel, na.rm = T)
hist(dist1$mittel)
mean(as.numeric(unique(dist$year)))
summary(as.numeric(unique(dist$year)))
names(dist)
missing_counts <- dist %>%  group_by(city_year) %>%   summarize(across(everything(), ~ sum(!is.na(.))))
nonzero = list()
for(i in 1:178){
  nonzero[[i]] = length(which(missing_counts[i,] !=0))
}

names(missing_counts)
table(missing_counts$dwelling_unit01)
table(missing_counts$occ_rooms01)
table(missing_counts$rent_rooms01)

length(unique(dist$city))
# sources = unique(dist[,c("city_year","source")])
# sources = sources[order(sources$city_year),]

# pop <- dist %>%  group_by(city_year) %>% summarise(population = sum(population, na.rm = T))
# sources = sources%>% left_join(pop, by = "city_year")

pop <- dist %>%  group_by(iso, city_year) %>% summarise(population = sum(population, na.rm = T))
pop <- dist %>%  group_by(iso) %>% summarise(l = length(unique(city_year)))





# Figure 1: Persons per hectare, unit, room, building 
# only cities with more than 250,000 inhabitants, no USA, NLD, GBR 
# Calculate total population by city_year
big <- dist %>%
  group_by(city_year) %>%
  summarise(city_population = sum(population, na.rm = TRUE), .groups = "drop")

# Filter original data to cities with population >= 250,000, exclude GBR, USA, NLD


big1 <- inner_join(dist, big, by="city_year")
big1<- big1%>% filter(iso!= "USA")
big1 <- big1 %>%
  filter(!(iso %in% c("GBR", "NLD") & city_population < 250000))



a = ggplot(data = big1[!is.na(big1$persons_building),],aes(persons_building, reorder(city_year, persons_building, FUN=median, na.rm=TRUE)))+
  geom_boxplot()+labs(x = "", y = "")+theme_minimal()+
  xlim(0,180)+ggtitle("Persons per building")+
  theme(axis.text=element_text(size=14), text = element_text(face = "bold"),
        plot.title = element_text(size = 20))+
  geom_vline(xintercept = median(big1$persons_building, na.rm = T), colour = "blue")
a

b = ggplot(data = big1[!is.na(big1$persons_unit),],aes(persons_unit, reorder(city_year, persons_unit,  FUN=median, na.rm=TRUE)))+
  geom_boxplot()+labs(x = "", y = "")+
  theme_minimal()+xlim(0,14)+ggtitle("Persons per unit")+
  theme(axis.text=element_text(size=12), text = element_text(face = "bold"),
        plot.title = element_text(size = 20))+
  geom_vline(xintercept = median(big1$persons_unit, na.rm = T), colour = "blue")
b
c = ggplot(data = big1[!is.na(big1$persons_room),],aes(persons_room, reorder(city_year, persons_room,  FUN=median, na.rm=TRUE)))+
  geom_boxplot()+labs(x = "", y = "")+
  theme_minimal()+ggtitle("Persons per room")+
  theme(axis.text=element_text(size=12, face = "bold"), text = element_text(face = "bold"),
        plot.title = element_text(size = 20))+
  geom_vline(xintercept = median(big1$persons_room, na.rm = T), colour = "blue")

c
d = ggplot(data = big1[!is.na(big1$persons_ha),],aes(persons_ha, reorder(city_year, persons_ha,  FUN=median, na.rm=TRUE)))+
  geom_boxplot()+labs(x = "", y = "")+theme_minimal()+ggtitle("Persons per hectare")+
  theme(axis.text=element_text(size=14, face = "bold"), text = element_text(face = "bold"),
        plot.title = element_text(size = 20))+
  geom_vline(xintercept = median(big1$persons_ha, na.rm = T), colour = "blue")
d
ggarrange(a,d)
ggarrange(b,c)



### Figure 2: Distribution of units by stories
names(dist)
dist$`4 or more` <- ifelse(
  !is.na(dist$units_stories05),
  rowSums(cbind(dist$units_stories05, dist$units_stories06, dist$stories_more), na.rm = TRUE),
  dist$units_stories04
)

dist3 <- dist%>% filter(!is.na(`4 or more`))
dist3 <- dist3%>% filter(!is.na(units_stories00))

names(dist3)
options(scipen = 999)

dist3 = dist3[, c(2,77:80, 85, 178)]
dist3 = melt(dist3, id = c("city_year", "district"))
dist3 = dist3[!is.na(dist3$value),]
dist3$variable = gsub("units_stories","", as.character(dist3$variable))
dist3 = dist3 %>% group_by(city_year, district) %>%  mutate(total = sum(value, na.rm = T))
dist3$rel_value = dist3$value/dist3$total*100
dist3$variable = gsub("units_stories","", as.character(dist3$variable))

ggplot(data = dist3[!is.na(dist3$rel_value),],aes(variable, rel_value))+
  geom_boxplot()+facet_wrap(~city_year, scales = "free")+
  labs(x="Nr of stories", y = "Percent of units in buildings of X stories")+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        text = element_text(size = 14, face = "bold"))



# Figure 3: Neighborhoods by homeownership rates and log number of housing units

ggplot(data = dist[!is.na(dist$own1),], aes(own1, log(housing.units)))+
  geom_point(aes(colour = iso, size = persons_building), alpha = 0.3)+
  scale_x_continuous(breaks = scales::pretty_breaks(n = 8), limits = c(0,70))+
  geom_text_repel(aes(label = label), max.overlaps = Inf)+
  theme_minimal()+
  theme(text = element_text(size = 14, face = "bold"))+
  geom_smooth(method = "lm")+
  labs(x = "Homeownership rate %", y = "Log number of housing units", size = "Persons/building", colour = "")



## Preparing Figure 4: Densities and rents by apartments of 
## different room numbers, 98 cities


names(dist)
dist1 = dist[, c(1,85,53:64)]
dist1 = melt(dist1, id = "city_year")
dist1 = dist1[!is.na(dist1$value),]
dist1$variable = gsub("occ_rooms","", as.character(dist1$variable))
# ggplot(data = dist1)+geom_bar(aes(value, variable), stat = "identity")+facet_wrap(~city, scales = "free")+theme_minimal()
# 
# ggplot(data = dist1)+geom_bar(aes(value, variable), stat = "identity")+facet_wrap(~city, scales = "free")
names(dist)
dist1 = dist[, c(2,5,85, 94:103)]

dist1 = melt(dist1, id = c("iso", "city_year", "district"))
dist1 = dist1[!is.na(dist1$value),]
dist1 = dist1[!is.infinite(dist1$value),]
dist1$variable = gsub("dens_rooms","", as.character(dist1$variable))
names(dist1)[5] = "density"


# Distribution of rents
names(dist)
dist2 = dist[, c(2,87,65:75)]
dist2 = melt(dist2, id = c("city_year", "district"))
dist2 = dist2[!is.na(dist2$value),]
dist2$variable = gsub("rent_rooms","", as.character(dist2$variable))
dist2$value = as.numeric(dist2$value)
dist2$number_room = as.numeric(dist2$variable)
dist2 <- dist2 %>% filter(number_room!=0)
dist2$rent_room = dist2$value/dist2$number_room
dist2 = dist2 %>% group_by(city_year, district) %>% mutate(rent_mean_district = mean(rent_room, na.rm =T)) 
dist2 = dist2 %>% group_by(city_year) %>% mutate(rent_mean_city = mean(rent_room, na.rm =T))
dist2 = dist2 %>% group_by(city_year, variable) %>% mutate(rent_mean_room_type = mean(rent_room, na.rm =T)) 
dist2$ratio_rent_city = dist2$rent_room/dist2$rent_mean_city
dist2$ratio_rent_district = dist2$rent_room/dist2$rent_mean_district
dist2$ratio_rent_room_type = dist2$rent_room/dist2$rent_mean_room_type

# Add rents
dist1 = dist1 %>% left_join(dist2, by = c("city_year","district","variable"))
dist1 = dist1[!is.na(dist1$density) & !is.infinite(dist1$density) & !is.na(dist1$city_year) & !is.na(dist1$iso),]
summary(dist1$ratio_rent_city)
hist(dist1$ratio_rent_city)
dist1 = dist1[dist1$iso !="FRA",]


ggplot(data = dist1,aes(density,variable))+
  geom_boxplot()+geom_jitter(aes(colour = ratio_rent_city), alpha = 1)+
  labs(x = "", y = "")+facet_wrap(~city_year, scales = "free")+
  theme_minimal()+labs(x = "Number of persons per room", y = "Housing units by number of rooms", colour = "Ratio of\ndistrict rent by\ncity rent") + 
  theme(text=element_text(size = 14, face = "bold"))+
  scale_colour_gradientn(colours = rainbow(5))+facet_wrap(~iso, scales = "free_x")+
  geom_smooth(aes(density, as.numeric(variable)), se = F)+xlim(1,6)




# Figure 5: Neighborhood room-specific rental burden and 
## average household income

# Rent to income
dist1 = dist[!is.na(dist$income.per.district),]
names(dist1)

ggplot(data = dist1)+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms00/dist1$income.per.district*100, colour = "00"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms01/dist1$income.per.district*100, colour = "01"), size=1)+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms02/2/dist1$income.per.district*100, colour = "02"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms03/3/dist1$income.per.district*100, colour = "03"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms04/4/dist1$income.per.district*100, colour = "04"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms05/5/dist1$income.per.district*100, colour = "05"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms06/6/dist1$income.per.district*100, colour = "06"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms07/7/dist1$income.per.district*100, colour = "07"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms08/8/dist1$income.per.district*100, colour = "08"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms09/9/dist1$income.per.district*100, colour = "09"))+
  geom_line(aes(log(dist1$income.per.district), dist1$rent_rooms10/10/dist1$income.per.district*100, colour = "10"))+
  facet_wrap(~city, scales = "free")+theme_minimal()+
  theme(text= element_text(size=16, face = "bold"))+labs(x = "Log average income", y = "% room rent to income", colour = "Units with \n rooms of...")



# Figure 6: Gini coefficient for different dimensions of 
## potential inequalities within cities across city neighborhoods


names(dist)
dist5 <- dist%>% select( -c(1,24:26, 76, 3:8, 91:107, 109:178,84, 40, 14:21))
dist6 <- dist%>% select(113:118, 41:52, 85,2)
names(dist5)
labels <- c("Area in Hectar", 
            "Number of Residential Buildings",
            "District Population",
            "Number Households",
            "Number Housing Units",
            "Rent Units",
            "Average Rent per Unit",
            "Owner Occupied Units",
            "Dwellings with one Housing Unit",
            "Dwellings with two Housing Units",
            "Dwellings with three Housing Units",
            "Dwellings with four Housing Units",
            "Dwellings with five Housing Units",
            "Dwellings with six Housing Units",
            "Dwellings with seven Housing Units",
            "Dwellings with eight Housing Units",
            "Dwellings with nine Housing Units",
            "Dwellings with ten Housing Units",
            "Dwellings with 11-20 Housing Units",
            "Dwellings with 21-50 Housing Units",
            "Housing Units with no Rooms",
            "Housing Units with one Room",
            "Housing Units with two Rooms",
            "Housing Units with three Rooms",
            "Housing Units with four Rooms",
            "Housing Units with five Rooms",
            "Housing Units with six Rooms",
            "Housing Units with seven Rooms",
            "Housing Units with eight Rooms",
            "Housing Units with nine Rooms",
            "Housing Units with ten Rooms",
            "Housing Units with more than 10 Rooms",
            "Occupants in Housing Units with no Room",
            "Occupants in Housing Units with one Room",
            "Occupants in Housing Units with two Rooms",
            "Occupants in Housing Units with three Rooms",
            "Occupants in Housing Units with four Rooms",
            "Occupants in Housing Units with five Rooms",
            "Occupants in Housing Units with six Rooms",
            "Occupants in Housing Units with seven Rooms",
            "Occupants in Housing Units with eight Rooms",
            "Occupants in Housing Units with nine Rooms",
            "Occupants in Housing Units with ten Rooms",
            "Occupants in Housing Units with more than 10 Rooms",
            "Rent for Housig Units with no Room",
            "Rent for Housig Units with one Room",
            "Rent for Housig Units with two Rooms",
            "Rent for Housig Units with three Rooms",
            "Rent for Housig Units with four Rooms",
            "Rent for Housig Units with five Rooms",
            "Rent for Housig Units with six Rooms",
            "Rent for Housig Units with seven Rooms",
            "Rent for Housig Units with eight Rooms",
            "Rent for Housig Units with nine Rooms",
            "Rent for Housig Units with ten or more Rooms", 
            "Dwellings with no Stories",
            "Dwellings with one Story",
            "Dwellings with two Stories",
            "Dwellings with three Stories",
            "Dwellings with four Stories",
            "Dwellings with five Stories",
            "Dwellings with six Stories",
            "Persons per Building",
            "Persons per Unit",
            "Number of Rooms",
            "Persons per Room",
            "Persons per Hectar",
            "Units per Building")  # Replace with actual labels

variable_names <- names(dist5)[!(names(dist5) %in% c("city_year", "district"))]  # Exclude identifier columns

for (i in seq_along(variable_names)) {
  attr(dist5[[variable_names[i]]], "label") <- labels[i]
}

# Create a label mapping for only the transformed variables
label_mapping <- data.frame(
  variable = variable_names,
  Label = labels,
  stringsAsFactors = FALSE
)


data_long <- melt(dist5, id = c("city_year","district"))
data_long$value = as.numeric(data_long$value)
data_long = data_long[!is.na(data_long$value), ]
library(ineq)
data_long1 = data_long%>% group_by(city_year, variable) %>% summarise(mean= mean(value), median = median(value), sd= sd(value), max = max(value),min = min(value), coeffvar =  sd(value, na.rm =T)/mean(value,na.rm=TRUE), n = length(value), uppquint = quantile(value,1), lowquint = quantile(value,0), gini = ineq(value, type = "Gini"), mean_median = mean(value)/median(value))

data_long1 <- merge(
  data_long1,
  label_mapping,
  by.x = "variable",  # Column in long-format data
  by.y = "variable",  # Column in label_mapping
  all.x = TRUE        # Keep all rows in the long-format data
)

# Final Plot 

ggplot(data = data_long1)+
  geom_boxplot(aes(gini, reorder(Label, gini, FUN = function(x) median(x, na.rm = TRUE))))+
  labs(x = "Gini coefficient", y = "")+xlim(0,1)+theme_minimal()+ 
  theme(text = element_text(face = "bold", size = 14))


# ggplot(data = data_long1) +
#   geom_boxplot(aes(coeffvar, reorder(Label, coeffvar, FUN = function(x) median(x, na.rm = TRUE)))) +
#   labs(x = "Variation coefficient", y = "") +
#   xlim(0, 2.5) +
#   theme_minimal() + theme(text = element_text(face = "bold", size = 14))


#### Figure 7: map plot


center = read.xlsx("/city_center.xlsx", startRow = 2)

load("/historical_city_map_data.Rdata")

install.packages("officer")
install.packages("flextable")
install.packages("docxtractr")
library(officer)
library(flextable)
library(docxtractr)
library(sf)

k3_df <- k3 %>%
  st_as_sf() %>%
  st_transform(crs = 4326) %>%
  st_cast("POLYGON")


k3 = k3 %>% left_join(center, by = "city") %>% ungroup()
cities = unique(k3$city)
plot = list()
for(i in cities){
  plot[[i]] =  ggplot(k3[k3$city == i,]) +
    geom_sf(aes(fill = persons_room), colour = "white") +
    ggtitle(i)+
    labs(x = "", y = "", colour = "") +
    scale_fill_gradientn(colours = rainbow(5))+
    geom_text_repel(aes(Longitude_centroid, Latitude_centroid, label = district))+
    geom_point(aes(lon1,lat1))+
    geom_point(aes(lon2,lat2))+
    geom_text(aes(lon1,lat1, label= "City hall"))+
    geom_text(aes(lon2,lat2, label= "Main station")) +
    labs(x = "Longitude", y = "Latitude")+
    theme_minimal() +
    theme(text = element_text(size = 18), axis.text.x=element_blank(),
          axis.text.y=element_blank()) 
  
}
for(i in 1:21){
  print(plot[[i]])
  readline(prompt = "Press [Enter] to continue...")
}

selected_plots <- plot[c(15, 13, 4, 6, 14, 20,5,2,12)]

ggsave("nine_plots.eps", plot = combined_plot, width = 17, height = 15)



# Figure 8: Inequality Comparison
# prepare comparison 1910 to 2016


names(dist)
dist1 = dist[, c(2,6,113:122, 85:87,89,90,23,108)]
dist1 = dist1 %>% left_join(dist2, by = c("city_year", "district"))


names(dist1)
structure = dist1[, c(1:13)]
density = dist1[, c(1,2,13:17,19)]
rent = dist1[, c(1,2,13,18)]

# structure set
structure_long <- melt(structure, id = c("city_year","district","year"))
structure_long$value = as.numeric(structure_long$value)
structure_long = structure_long[!is.na(structure_long$value), ]


structure_long = structure_long%>% group_by(city_year, variable, year) %>% summarise(mean= mean(value), median = median(value), sd= sd(value), max = max(value),min = min(value), coeffvar =  sd(value, na.rm =T)/mean(value,na.rm=TRUE), n = length(value), uppquint = quantile(value,1), lowquint = quantile(value,0), gini = ineq(value, type = "Gini"), mean_median = mean(value)/median(value))

structure_long = structure_long %>% group_by(variable) %>% mutate(median_gini = median(gini, na.rm = T), mean_gini = mean(gini, na.rm= T))

# rename values

structure_long <- structure_long %>% mutate(variable = recode(variable,
                                                              share_unit1= "One-Room Units",
                                                              share_unit2= "Two-Room Units",
                                                              share_unit3= "Three-Room Units",
                                                              share_unit4= "Four-Room Units",
                                                              share_unit5= "Five-Room Units",
                                                              share_unit6more= "Six or more Rooms",
                                                              dwelling_unit12="Single-family\n Buildings",
                                                              dwelling_unit39="3-9 Units\n per Building",
                                                              dwelling_unit1020="10-20 Units\n per Building",
                                                              dwelling_unit_more=">20 Units\n per Building"))

# only German cities which are in both years 

structure_german <- structure_long %>% filter(city_year== "Hamburg1910"| city_year=="Cologne1910"| city_year=="Dusseldorf1905"|city_year=="FreiburgimBreisgau1910"| 
                                                city_year=="Mannheim1910"| city_year=="Frankfurt1925"| city_year=="Kiel1925"|city_year=="Nuremberg1910"|city_year=="Berlin1910"|
                                                city_year=="Halle(Saale)1910"| city_year=="Hannover1900"| city_year=="Dresden1900"| city_year=="Leipzig1900"| city_year=="Karlsruhe1905"|
                                                city_year=='Erfurt1905'|city_year=="Wiesbaden1912"|city_year=="Ludwigshafen1925"|city_year=="F<U+00FC>rth1907"|city_year=="Augsburg1904"|
                                                city_year=="Magdeburg1900"|city_year=="Munich1900")


structure_german = structure_german %>% mutate(JAHR = 1908)
names(structure_german)
structure_german = structure_german[,c("city_year", "variable","JAHR","gini", 
                                       "coeffvar", "median_gini")]
structure_german <- structure_german %>% 
  rename(Stadt = city_year)


# density set

density_long <- melt(density, id = c("city_year","district","year"))
density_long$value = as.numeric(density_long$value)
density_long = density_long[!is.na(density_long$value), ]

density_long = density_long%>% group_by(city_year, variable, year) %>% summarise(mean= mean(value), median = median(value), sd= sd(value), max = max(value),min = min(value), coeffvar =  sd(value, na.rm =T)/mean(value,na.rm=TRUE), n = length(value), uppquint = quantile(value,1), lowquint = quantile(value,0), gini = ineq(value, type = "Gini"), mean_median = mean(value)/median(value))

density_long = density_long %>% group_by(variable) %>% mutate(median_gini = median(gini, na.rm = T), mean_gini = mean(gini, na.rm= T))

density_long <- density_long %>% mutate(variable = recode(variable, persons_room = "Persons per\n Housing Room",
                                                          persons_unit = "Persons per\n Housing Unit",
                                                          persons_building = "Persons \n per Building",
                                                          units_building="Housing Units\n per Building",
                                                          persons_ha="Persons \n per Hectar"))



density_german <- density_long %>% filter(city_year== "Hamburg1910"| city_year=="Cologne1910"| city_year=="Dusseldorf1905"|city_year=="FreiburgimBreisgau1910"| 
                                            city_year=="Mannheim1910"| city_year=="Frankfurt1925"| city_year=="Kiel1925"|city_year=="Nuremberg1910"|city_year=="Berlin1910"|
                                            city_year=="Halle(Saale)1910"| city_year=="Hannover1900"| city_year=="Dresden1900"| city_year=="Leipzig1900"| city_year=="Karlsruhe1905"|
                                            city_year=='Erfurt1905'|city_year=="Wiesbaden1912"|city_year=="Ludwigshafen1925"|city_year=="F<U+00FC>rth1907"|city_year=="Augsburg1904"|
                                            city_year=="Magdeburg1900"|city_year=="Munich1900")

density_german = density_german %>% mutate(JAHR = 1908)

density_german = density_german[,c("city_year", "variable","JAHR","gini", 
                                   "coeffvar",
                                   "median_gini")]
density_german <- density_german %>% 
  rename(Stadt = city_year)



# rent set
names(rent)
rent_long <- melt(rent, id = c("city_year","district","year"))
rent_long$value = as.numeric(rent_long$value)
rent_long = rent_long[!is.na(rent_long$value), ]

rent_long <- rent_long %>% mutate(variable = recode(variable, average.rent= "Average Rent\n per Unit"))
rent_long = rent_long%>% group_by(city_year, variable, year) %>% summarise(mean= mean(value), median = median(value), sd= sd(value), max = max(value),min = min(value), coeffvar =  sd(value, na.rm =T)/mean(value,na.rm=TRUE), n = length(value), uppquint = quantile(value,1), lowquint = quantile(value,0), gini = ineq(value, type = "Gini"), mean_median = mean(value)/median(value))

rent_long = rent_long %>% group_by(variable) %>% mutate(median_gini = median(gini, na.rm = T), mean_gini = mean(gini, na.rm= T))


rent_german <- rent_long %>% filter(city_year== "Hamburg1910"| city_year=="Cologne1910"| city_year=="Dusseldorf1905"|city_year=="FreiburgimBreisgau1910"| 
                                      city_year=="Mannheim1910"| city_year=="Frankfurt1925"| city_year=="Kiel1925"|city_year=="Nuremberg1910"|city_year=="Berlin1910"|
                                      city_year=="Halle(Saale)1910"| city_year=="Hannover1900"| city_year=="Dresden1900"| city_year=="Leipzig1900"| city_year=="Karlsruhe1905"|
                                      city_year=='Erfurt1905'|city_year=="Wiesbaden1912"|city_year=="Ludwigshafen1925"|city_year=="F<U+00FC>rth1907"|city_year=="Augsburg1904"|
                                      city_year=="Magdeburg1900"|city_year=="Munich1900")

rent_german = rent_german %>% mutate(JAHR = 1908)

rent_german = rent_german[,c("city_year", "variable","JAHR","gini", 
                             "coeffvar", "median_gini")]
rent_german <- rent_german %>% 
  rename(Stadt = city_year)



# Due to privacy restrictions, we are not allowed to share the contemporary data 
# on German Housing provided by the Innerstädtische Raumbeobachtung (IRB) as 
# well as the price and rent data provided by Immoscout. The following parts can 
# therefore not be replicated

# include IRB Data 

# IRB
# proprietory data, not for public use
# irb = read.xlsx("/Users/riawilken/Dropbox/Familienbereich/Mueller-Kohl/IRB 2002_2021/IRB_2002_2021.xlsx")
# irb1 = read.xlsx("/Users/riawilken/Dropbox/Familienbereich/Mueller-Kohl/IRB 2002_2021/IRB_2002_2021_BA.xlsx")
# umstieg = read.xlsx("/Users/riawilken/Dropbox/Familienbereich/Mueller-Kohl/IRB 2002_2021/Umstiegsschlussel_BA.xlsx")
# umstieg = umstieg[,c("IDBA", "IDIRB")]
# irb1 = irb1 %>% left_join(umstieg, by = c("IDBA"))
# irb1$GKZ = NULL
# irb1$Stadt = NULL
# irb = irb %>% left_join(irb1, by = c("IDIRB","JAHR"))
# rm(irb1, umstieg)

# irb2 = read.xlsx("/Users/riawilken/Dropbox/Familienbereich/Mueller-Kohl/IRB 2002_2021/IRB 2016/Copy of IRB_2002_2016_3.xlsx")

# options(scipen = 999)
# irb2 = irb2[,c("IDIRB","FL_GES")]
# irb2 = irb2 %>% group_by(IDIRB) %>% summarise(FL_GES = mean(FL_GES, na.rm = T))
# irb = irb %>% left_join(irb2, by = c("IDIRB"))
# rm(irb2)
# 
# irb = irb[!grepl("nicht zuord*",irb$Stadtteil),]
# 
# irb$person_building = irb$EWHG/irb$GEBGES
# irb$person_room = irb$EWHG/irb$RAEUME
# irb$person_unit = irb$EWHG/irb$WOHNGES
# irb$area_person = irb$WOHNFL/irb$EWHG
# irb$person_ha = irb$EWHG/irb$FL_GES
# 
# names(irb)
# irb$WOHN1 = irb$WOHN1/irb$WOHNGES
# irb$WOHN2 = irb$WOHN2/irb$WOHNGES
# irb$WOHN3 = irb$WOHN3/irb$WOHNGES
# irb$WOHN4 = irb$WOHN4/irb$WOHNGES
# irb$WOHN5 = irb$WOHN5/irb$WOHNGES
# irb$WOHN6UM = (irb$WOHN7UM+irb$WOHN6)/irb$WOHNGES
# irb$GEB12 = irb$GEB12/irb$GEBGES
# irb$GEB39 = irb$GEB39/irb$GEBGES
# irb$GEB1019 = irb$GEB1019/irb$GEBGES
# irb$GEB20UM = irb$GEB20UM/irb$GEBGES
# irb$ALOGES = irb$ALOGES/irb$BESGES
# summary(irb$ALOGES)
# irb$ALOSGB2 = irb$ALOSGB2/irb$BESGES
# irb$units_building = irb$WOHNGES/irb$GEBGES
# 
# names(irb)

# # remove outliers
# irb <- irb %>%
#   mutate(Stadt = recode(Stadt, "K<U+00F6>ln" = 'Koeln', "F<U+00FC>rth" = 'Fuerth', "D<U+00FC>sseldorf" =  'Duesseldorf',
#                         "M<U+00FC>lheim an der Ruhr" = "Muelheim an der Ruhr", 'L<U+00FC>beck' = "Luebeck", 'M<U+00FC>nster'= "Muenster",
#                         "M<U+00FC>nchen"="Muenchen", 'Saarbr<U+00FC>cken'="Saarbruecken", 'N<U+00FC>rnberg'='Nuernberg',"W<U+00FC>rzburg"='Wuerzburg'))
# 
# irb <- irb %>%filter(person_room < 3.0)
# irb <- irb %>%filter(person_unit < 8.0)
# irb$person_room[irb$Stadt=="Fuerth"] <- NA
# irb[irb == Inf] <- NA
# irb[irb< 0] <- NA
# irb$person_room[irb$person_room== 169.500000] <- NA
# irb$person_room[irb$person_room== 93.028571] <- NA
# irb$WOHN1[irb$WOHN1== 0.5911602210] <- NA




# # include rent data 
# # proprietory data, not for public use
# # load("/code and data/immoscout_price_dispersion.Rdata")
# names(immo)
# 
# immo16 = immo[immo$year == 2016,]
# 
# immo16 <- immo16[!grepl("(Kreis)", immo16$is24_stadt_kreis),] # include cities only
# immo16 <- immo16[!grepl("hk_preis", immo16$variable),]
# 
# 
# names(immo16) # only variables of interest
# 
# immo16=immo16[, c(1,2,9,13)]
# 
# immo16 = immo16 %>% group_by(variable) %>% mutate(median_coeff = median(coeffvar, na.rm = T), median_gini = median(gini, na.rm = T))
# 
# immo16 <- immo16 %>% rename("Stadt"="is24_stadt_kreis")
# 
# 
# ## only selected variables 
# 
# names(irb)
# 
# # density set
# dens = irb[,c("Stadt", "Stadtteil","JAHR","person_room", 
#               "person_unit","person_building", "units_building", "person_ha")]
# 
# dens = dens[dens$JAHR == 2016,]
# dens_long <- melt(dens, id = c("Stadt","Stadtteil", "JAHR"))
# dens_long$value = as.numeric(dens_long$value)
# dens_long = dens_long[!is.na(dens_long$value), ]
# 
# 
# # gini
# dens_long = dens_long%>% group_by(Stadt, variable, JAHR) %>% summarise( gini = ineq(value, type = "Gini"))
# 
# 
# #median 
# dens_long = dens_long %>% group_by(variable) %>% mutate(median_gini = median(gini, na.rm = T))
# 
# 
# 
# 
# # rename values
# 
# dens_long <- dens_long %>% mutate(variable = recode(variable, person_room = "Persons per\n Housing Room",
#                                                     person_building = "Persons \n per Building",
#                                                     person_unit = "Persons per\n Housing Unit",
#                                                     units_building="Housing Units\n per Building",
#                                                     person_ha="Persons \n per Hectar"))
# 
# # selected cities (only cities with data in 2016 AND in city district set)
# 
# 
# 
# densIRB <- dens_long %>% filter(Stadt== "Hamburg"| Stadt=="Koeln"| Stadt=="Duesseldorf"|Stadt=="Freiburg im Breisgau"| 
#                                   Stadt=="Mannheim"| Stadt=="Frankfurt am Main"| Stadt=="Kiel"|Stadt=="Leipzig"|
#                                   Stadt=="Nuernberg"|Stadt=="Berlin"|Stadt=="Karlsruhe"|Stadt=="Hannover"|
#                                   Stadt=="Halle (Saale)" | Stadt=="Dresden"| Stadt=="Fuerth"| Stadt=="Augsburg"|
#                                   Stadt=="Magdeburg"|Stadt=="Ludwigshafen am Rhein"|Stadt=='Muenchen'|Stadt=='Erfurt'|
#                                   Stadt=='Wiesbaden')
# 
# 
# all_dens <- rbind(density_german, densIRB)
# 
# # structure set 
# 
# struc = irb[,c("Stadt", "Stadtteil","JAHR","WOHN1",
#                "WOHN2","WOHN3","WOHN4","WOHN5","WOHN6UM",
#                "GEB12","GEB39","GEB1019", "GEB20UM")]
# 
# struc = struc[struc$JAHR == 2016,]
# struc_long <- melt(struc, id = c("Stadt","Stadtteil", "JAHR"))
# struc_long$value = as.numeric(struc_long$value)
# struc_long = struc_long[!is.na(struc_long$value), ]
# 
# 
# # gini
# struc_long = struc_long%>% group_by(Stadt, variable, JAHR) %>% summarise( gini = ineq(value, type = "Gini"))
# 
# 
# #median 
# struc_long = struc_long %>% group_by(variable) %>% mutate(median_gini = median(gini, na.rm = T))
# 
# 
# struc_long <- struc_long %>%
#   mutate(variable = as.character(variable)) %>%
#   mutate(variable = recode(variable,
#                            "WOHN1"   = "One-Room Units",
#                            "WOHN2"   = "Two-Room Units",
#                            "WOHN3"   = "Three-Room Units",
#                            "WOHN4"   = "Four-Room Units",
#                            "WOHN5"   = "Five-Room Units",
#                            "WOHN6UM" = "Six or more Rooms",
#                            "GEB12"   = "Single-family\n Buildings",
#                            "GEB39"   = "3-9 Units\n per Building",
#                            "GEB1019" = "10-20 Units\n per Building",
#                            "GEB20UM" = ">20 Units\n per Building"
#   ))
# 
# strucIRB <- struc_long %>% filter(Stadt== "Hamburg"| Stadt=="Koeln"| Stadt=="Duesseldorf"|Stadt=="Freiburg im Breisgau"| 
#                                     Stadt=="Mannheim"| Stadt=="Frankfurt am Main"| Stadt=="Kiel"|Stadt=="Leipzig"|
#                                     Stadt=="Nuernberg"|Stadt=="Berlin"|Stadt=="Karlsruhe"|Stadt=="Hannover"|
#                                     Stadt=="Halle (Saale)" | Stadt=="Dresden"| Stadt=="Fuerth"| Stadt=="Augsburg"|
#                                     Stadt=="Magdeburg"|Stadt=="Ludwigshafen am Rhein"|Stadt=='Muenchen'|Stadt=='Erfurt'|
#                                     Stadt=='Wiesbaden')
# 
# all_struc <- rbind(structure_german, strucIRB)
# 
# # rent set 
# 
# 
# 
# rentIRB <- immo16 %>% filter(Stadt== "Hamburg"| Stadt=="Koeln"| Stadt=="Duesseldorf"|Stadt=="Freiburg im Breisgau"| 
#                                Stadt=="Mannheim"| Stadt=="Frankfurt am Main"| Stadt=="Kiel"|Stadt=="Leipzig"|
#                                Stadt=="Nuernberg"|Stadt=="Berlin"|Stadt=="Karlsruhe"|Stadt=="Hannover"|
#                                Stadt=="Halle (Saale)" | Stadt=="Dresden"| Stadt=="Fuerth"| Stadt=="Augsburg"|
#                                Stadt=="Magdeburg"|Stadt=="Ludwigshafen am Rhein"|Stadt=='Muenchen'|Stadt=='Erfurt'|
#                                Stadt=='Wiesbaden')
# 
# rentIRB<- rentIRB %>% mutate(variable = recode(variable, 
#                                                wm_miete= "Average Rent\n per Unit"))
# 
# rentIRB$JAHR = 2016
# 
# all_rent <- rbind(rent_german, rentIRB)
# 
# # structure, density rent inequality plot comparison 1908 2016
# 
# 
# 
# a <- ggplot(all_struc, aes(gini, variable)) + 
#   geom_boxplot(aes(gini, reorder(variable, median_gini), color = factor(JAHR)))+
#   labs(x = "Gini coefficient",y = "Indicator", color= "Year")+
#   theme_minimal()+
#   theme(text = element_text(size=14, face="bold"))+ggtitle("Structure Inequality")
# a
# b <- ggplot(all_dens, aes(gini, variable)) + 
#   geom_boxplot(aes(gini, reorder(variable, median_gini),color = factor(JAHR)))+
#   labs(x = "Gini coefficient", y="", color= "Year")+theme_minimal()+
#   theme(text = element_text(size=14, face="bold"))+
#   ggtitle("Density Inequality")
# b
# 
# c <- ggplot(all_rent, aes(gini, variable)) + geom_boxplot(aes(gini, reorder(variable, median_gini),color = factor(JAHR)))+
#   labs(x = "Gini coefficient",y= "",color= "Year")+
#   theme_minimal()+
#   theme(text = element_text(size=14, face="bold"))+ggtitle("Rent Inequality")
# 
# c
# ggarrange(a,b,c, ncol = 3, common.legend = T)
# 
# 
# # Plot 8 comaprison IRB and historical data
# 
# DEU <- dist %>% filter(iso=="DEU")
# 
# names(DEU)
# 
# irb4 <- irb %>%filter(EWHG > 500)
# irb4$person_room[irb4$Stadt=="Erfurt"] <- NA
# 
# irb1 = irb4 %>% group_by(Stadt, Stadtteil) %>% summarise(across(where(is.numeric), ~ median(.x, na.rm = TRUE)))
# 
# a = ggplot(data = irb1[!is.na(irb1$person_building) & is.finite(irb1$person_building),],aes(person_building, reorder(Stadt, person_building)))+
#   geom_boxplot()+labs(x = "", y = "")+theme_minimal()+
#   theme(axis.text=element_text(size=14, face = "bold"), 
#         text = element_text(size = 14, face = "bold"),
#         plot.title = element_text(size = 20))+scale_x_continuous(breaks = scales::pretty_breaks(n = 8), limits = c(0,75))+
#   ggtitle("Persons per building")+geom_vline(xintercept = median(irb1$person_building, na.rm = T), colour = "blue")+
#   geom_vline(xintercept = median(DEU$persons_building, na.rm = T), colour = "red")
# 
# a
# 
# b = ggplot(data = irb1[!is.na(irb1$person_unit),],aes(person_unit, reorder(Stadt, person_unit)))+
#   geom_boxplot()+labs(x = "", y = "")+
#   theme_minimal()+
#   theme(axis.text=element_text(size=14, face = "bold"), 
#         text = element_text(size = 14, face = "bold"),
#         plot.title = element_text(size = 20))+scale_x_continuous(breaks = scales::pretty_breaks(n = 8), limits = c(0.1,5.5))+
#   ggtitle("Persons per unit")+geom_vline(xintercept = median(irb1$person_unit, na.rm = T), colour = "blue")+
#   geom_vline(xintercept = median(DEU$persons_unit, na.rm = T), colour = "red")
# b
# c = ggplot(data = irb1[!is.na(irb1$person_room),],aes(person_room, reorder(Stadt, person_room)))+
#   geom_boxplot()+labs(x = "", y = "")+theme_minimal()+
#   theme(axis.text=element_text(size=14, face = "bold"), 
#         text = element_text(size = 14, face = "bold"),
#         plot.title = element_text(size = 20))+
#   ggtitle("Persons per room")+scale_x_continuous(breaks = scales::pretty_breaks(n = 8), limits = c(0,3))+
#   geom_vline(xintercept = median(irb1$person_room, na.rm = T), colour = "blue")+
#   geom_vline(xintercept = median(DEU$persons_room, na.rm = T), colour = "red")
# c
# d = ggplot(data = irb1[!is.na(irb1$person_ha),],aes(person_ha, reorder(Stadt, person_ha)))+geom_boxplot()+
#   labs(x = "", y = "")+theme_minimal()+
#   theme(axis.text=element_text(size=14, face = "bold"), 
#         text = element_text(size = 14, face = "bold"),
#         plot.title = element_text(size = 20))+
#   ggtitle("Persons per hectare")+scale_x_continuous(breaks = scales::pretty_breaks(n = 8), limits = c(0.1,450))+
#   geom_vline(xintercept = median(irb1$person_ha, na.rm = T), colour = "blue")+
#   geom_vline(xintercept = median(DEU$persons_ha, na.rm = T), colour = "red")
# d
# ggarrange(a,d)
# ggarrange(c,b)




# Regression Analysis 

load("/cities_geocodes.Rdata")

# g1 = plyr:::rbind.fill(g)
# city = unique(dist$city)
# g1 = cbind(city, g1)
names(dist)
dist$city = gsub("[0-9]*","",dist$city)
dist = dist %>% left_join(g1, by = "city")
names(dist)

dist$pop_share = dist$population/dist$city_pop*100
dist = dist %>% group_by(city) %>% mutate(population_rank = rank(population), city_pop = sum(population, na.rm = T))
names(dist)
summary(dist$pop_share)
dist = dist[!is.na(dist$iso),]


reg0 = lmer(persons_room~1+(1|city)+(1|iso), data = dist)
var  = as.data.frame(VarCorr(reg0))
cbind(var$grp, round(var$sdcor/sum(var$sdcor),2)*100)
dist$lcity_spurt
reg1 = lmer(persons_room~population_rank+lunits_ha+(1|city)+(1|iso)+lon+lat+year, data = dist[!is.na(dist$persons_room),])
reg2 = lmer(persons_room~population_rank+gdp_cap1800_scaled+lgdp_spurt+(1|city)+(1|iso)+lon+lat+year, data = dist)
reg3 = lmer(persons_room~population_rank+lcity_pop+lcity_spurt+lcity_age+(1|city)+(1|iso)+lon+lat+year, data = dist)
reg4 = lmer(persons_room~population_rank+lunits_ha+gdp_cap1800_scaled+lgdp_spurt+lcity_pop+lcity_spurt+(1|city)+(1|iso)+lon+lat+year, data = dist)
stargazer(reg1,reg2,reg3,reg4, type = "text", single.row = T, out = "/neighborhood_regression_perroom_final.html", column.labels = c("Building density","Economic","Demography","Full"), covariate.labels = c("Neighb. Pop. Rank","Neighb. log units/ha","Log GDP pc 1800","Log GDP spurt","Log city pop","Log city pop spurt","Log city age","City longitude","City latitude","Census year"), dep.var.caption = "")


summary(dist$persons_unit)
summary(dist$city_spurt)

install.packages("ggpmisc")
library(ggpmisc)

ggplot(data = dist, aes(x = rank(dist$lcity_pop), y = persons_unit)) +  geom_point() +  geom_smooth(method = "lm", se = FALSE, color = "blue") + stat_poly_eq(formula = y ~ x, aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~")),parse = TRUE) 
names(dist)
ggplot(data = dist, aes(dist$lcity_pop, dist$lcity_spurt_scaled))+geom_smooth(method= "lm")+geom_point()

reg0 = lmer(persons_unit~1+(1|city)+(1|iso), data = dist)
var  = as.data.frame(VarCorr(reg0))
cbind(var$grp, round(var$sdcor/sum(var$sdcor),2)*100)
reg1 = lmer(persons_unit~population_rank+lunits_ha+(1|city)+(1|iso)+lon+lat+year, data = dist)
reg2 = lmer(persons_unit~population_rank+gdp_cap1800_scaled+lgdp_spurt+(1|city)+(1|iso)+lon+lat+year, data = dist)
reg3 = lmer(persons_unit~population_rank+lcity_pop+lcity_spurt+lcity_age+(1|city)+(1|iso)+lon+lat+year, data = dist)
reg4 = lmer(persons_unit~population_rank+lunits_ha+gdp_cap1800_scaled+lgdp_spurt+lcity_pop+lcity_spurt+(1|city)+(1|iso)+lon+lat+year, data = dist)
stargazer(reg1,reg2,reg3,reg4, type = "text", single.row = T, out = "code and data/neighborhood_regression_persons_unit_final.html", column.labels = c("Building density","Economic","Demography","Full"), dep.var.labels = "", covariate.labels = c("Neighb. Pop. Rank","Neighb. log units/ha","Log GDP pc 1800","Log GDP spurt","Log city pop","Log city pop spurt","Log city age","City longitude","City latitude","Census year"), dep.var.caption = "")


